• Jump To … +
    LICENSE.md README.md index.md ast.js identifiers.js parser.js stats.js trace.js utilities.js
  • §

    Overview of the Parser Class

  • §

    The Parser class serves as the core interface to the apg-unicode parsing engine. It encapsulates the primary parse function and exposes modular facilities for enhanced parsing diagnostics and customization.

  • §

    Design Philosophy

  • §
    • Modular Diagnostics: AST, Trace, and Stats are opt-in, allowing lightweight parsing when needed and deep introspection when desired.
    • Callback-Driven Customization: Callback functions can be attached to Rule and UDT names for input string translation during the pass through the parse tree.
    • Semantic Clarity: Each method is purpose-built to expose a distinct parsing concern, following the single-responsibility principle.
    import { identifiers as id } from '../src/identifiers.js';
    import { utilities as utils } from '../src/utilities.js';
    export { Parser };
    class Parser {
      #FILENAME = 'parser.js: ';
      #charType;
      #ast;
      #stats;
      #trace;
      #lookAhead;
      #treeDepth;
      #maxTreeDepth;
      #nodeHits;
      #maxMatched;
      #rules;
      #udts;
      #opcodes;
      #chars;
      #charStart;
      #charEnd;
      #ruleCallbacks;
      #udtCallbacks;
      #userData;
      #sysData = {};
  • §

    The constructor takes a single argument, an SABNF grammar object. See any of the examples in the examples directory for usage.

      constructor(grammar) {
        this.#rules = grammar.rules;
        this.#udts = grammar.udts;
        this.#ruleCallbacks = Array(this.#rules.length).fill(undefined);
        this.#udtCallbacks = this.#udts.length ? Array(this.#udts.length).fill(undefined) : undefined;
      }
  • §

    Attach a Trace object to the parser. The parser will interact with the Trace object generating a trace of the parser through all of the nodes of the parse tree. See examples/trace for usage.

      setTrace(trace) {
        if (!trace) {
          this.#trace = undefined;
        } else {
          this.#trace = trace;
        }
      }
  • §

    Attach a Stats object to the parser. The parser will interact with the Stats object generating a collection of node hits by the parser. See examples/stats for usage.

      setStats(stats) {
        if (!stats) {
          this.#stats = undefined;
        } else {
          this.#stats = stats;
        }
      }
  • §

    Attach an Ast object to the parser. The parser will interact with the Ast object generating an AST. See examples/ast for usage.

      setAst(ast) {
        if (!ast) {
          this.#ast = undefined;
        } else {
          this.#ast = ast;
          this.#ast.initGrammar(this.#rules, this.#udts);
        }
      }
  • §

    Attach a callback function to a rule or UDT named in the SABNF grammar.

  • §
    • @param {string | undefined} name - a valid rule or UDT name. If undefined all callback functions will be cleared.
    • @param {function} callback - a user-written function for processing the node phrases
      setCallback(name, callback) {
        if (name === undefined) {
          /* clear all callbacks */
          this.#ruleCallbacks = Array(this.#rules.length).fill(undefined);
          this.#udtCallbacks = Array(this.#udts.length).fill(undefined);
          return;
        }
        if (!(typeof name === 'string' && typeof callback === 'function')) {
          throw new Error(`${this.#FILENAME}: setCallback() argument types not "string", "function"`);
        }
        /* see if this is a rule name */
        const lower = name.toLowerCase();
        for (const rule of this.#rules) {
          if (lower === rule.lower) {
            this.#ruleCallbacks[rule.index] = callback;
            return;
          }
        }
        /* see if this is a UDT name */
        for (const udt of this.#udts) {
          if (lower === udt.lower) {
            this.#udtCallbacks[udt.index] = callback;
            return;
          }
        }
        throw new Error(`${this.#FILENAME}: setCallback name not a rule name or UDT name: ${name}`);
      }
  • §

    Parse an input string against the configured SABNF grammar.

  • §
    • @param {string} startName - Name of the start rule.
    • @param {*} input - The input string. May be of type:
      • string(converted internally to Uint32Array of codepoints)
      • Array
      • Buffer
      • Uint8Array
      • Uint16Array
      • Uint32Array
    • @param {*} callbackData - User callback data. Not used by the parser but available to callback functions for the application’s use.
    • @param {number} startChar - Index of the first character (inclusive) of the substring to parse. (default = 0)
    • @param {number} endChar - Index of the last character (exclusive) of the substring to parse. (default = input.length)
    • Note: startChar and endChar follow the same rules as for String.prototype.slice().
    • @returns - An object with the parser results:
      • success: true if the state is MATCH or EMPTY and the matched characters is the string or substring length
      • state: the final state identifier
      • stateName: the final state name, MATCH, EMPTY, or NOMATCH
      • length: the length of the substring being parsed, charEnd - charStart
      • matched: the number of characters matched
      • maxMatched: the maximum number of characters matched
      • maxTreeDepth: the maximum tree depth reached
      • nodeHits: the number of nodes visited
      parse(startName, input, callbackData, startChar, endChar) {
        const FUNCTION_NAME = `${this.#FILENAME}parse(): `;
        /* check for all UDT's defined */
        for (const udt of this.#udts) {
          if (!this.#udtCallbacks[udt.index]) {
            throw new Error(`${this.#FILENAME}UDT callback not defined: ${udt.name}`);
          }
        }
    
        this.#reset();
    
        /* validate input string type */
        this.#chars = input;
        if (typeof input === 'string') {
          this.#charType = id.UINT32;
          this.#chars = utils.stringToCodePoints(input);
        } else if (input instanceof Uint8Array) {
          this.#charType = id.UINT8;
        } else if (input instanceof Uint16Array) {
          this.#charType = id.UINT16;
        } else if (input instanceof Uint32Array) {
          this.#charType = id.UINT32;
        } else if (Array.isArray(input)) {
          this.#charType = id.ARRAY;
        } else {
          throw new Error(
            `${FUNCTION_NAME} invalid input - must string, Array, Buffer, Uint8Array, Uint16Array or Uin32Array`
          );
        }
    
        /* get the sub-string, if any */
        const ret = utils.sliceInterval(this.#chars.length, startChar, endChar);
        this.#charStart = ret.indexStart;
        this.#charEnd = ret.indexEnd;
    
        /* Find start rule index (case-insensitive) */
        const r = this.#rules;
        const lower = startName.toLowerCase();
        let startIndex = undefined;
        for (const rule of this.#rules) {
          if (lower === rule.lower) {
            startIndex = rule.index;
            break;
          }
        }
        if (startIndex === undefined) {
          throw new Error(`${FUNCTION_NAME}start name not a valid grammar rule name: '${startName}'`);
        }
    
        /* Initialize AST/stats/trace if provided */
        if (this.#trace) {
          this.#trace.init(this.#rules, this.#udts, this.#chars, this.#charEnd, this.#charType);
        }
        if (this.#stats) {
          this.#stats.init(this.#rules, this.#udts);
        }
        if (this.#ast) {
          this.#ast.initChars(this.#chars);
        }
    
        /* Create a dummy opcode for the root (start rule) node and begin parsing */
        this.#opcodes = [
          {
            type: id.RNM,
            index: startIndex,
          },
        ];
        this.#userData = callbackData;
        this.#opExecute(0, this.#charStart);
        this.#opcodes = undefined;
    
        /* Determine final result from sysData */
        let success = false;
        switch (this.#sysData.state) {
          case id.ACTIVE:
            throw new Error(`${FUNCTION_NAME}final state should never be 'ACTIVE'`);
          case id.NOMATCH:
            success = false;
            break;
          case id.EMPTY:
          case id.MATCH:
            success = this.#sysData.phraseLength === this.#charEnd - this.#charStart;
            break;
          default:
            /* should never get here, but just in case */
            throw new Error(`${FUNCTION_NAME}unrecognized final state: ${this.#sysData.state}`);
        }
        return {
          success,
          state: this.#sysData.state,
          stateName: id.idName(this.#sysData.state),
          length: this.#charEnd - this.#charStart,
          matched: this.#sysData.phraseLength,
          maxMatched: this.#maxMatched - this.#charStart,
          maxTreeDepth: this.#maxTreeDepth,
          nodeHits: this.#nodeHits,
        };
      }
  • §

    Resets the internal state variables. Called by the parser before each parse.

      #reset() {
        this.#lookAhead = 0;
        this.#treeDepth = 0;
        this.#maxTreeDepth = 0;
        this.#nodeHits = 0;
        this.#maxMatched = 0;
        this.#opcodes = undefined;
        this.#chars = undefined;
        this.#charStart = 0;
        this.#charEnd = 0;
        this.#sysData.state = id.ACTIVE;
        this.#sysData.phraseLength = 0;
        this.#userData = undefined;
      }
  • §

    For each node visit in the parse tree, executes the appropriate operation. Having a single point of execution for each node hit facilitates trace and stats collection as well as other parsing statistics reported in the return object.

      #opExecute(opIndex, phraseIndex) {
        const FUNCTION_NAME = `${this.#FILENAME}#opExecute(): `;
        const op = this.#opcodes[opIndex];
        this.#nodeHits++;
        if (this.#treeDepth > this.#maxTreeDepth) {
          this.#maxTreeDepth = this.#treeDepth;
        }
        this.#treeDepth++;
        this.#sysData.state = id.ACTIVE;
        this.#sysData.phraseLength = 0;
        if (this.#trace) {
          this.#trace.down(op, phraseIndex);
        }
        switch (op.type) {
          case id.ALT:
            this.#opALT(opIndex, phraseIndex);
            break;
          case id.CAT:
            this.#opCAT(opIndex, phraseIndex);
            break;
          case id.REP:
            this.#opREP(opIndex, phraseIndex);
            break;
          case id.RNM:
            this.#opRNM(opIndex, phraseIndex);
            break;
          case id.TRG:
            this.#opTRG(opIndex, phraseIndex);
            break;
          case id.TBS:
            this.#opTBS(opIndex, phraseIndex);
            break;
          case id.TLS:
            this.#opTLS(opIndex, phraseIndex);
            break;
          case id.UDT:
            this.#opUDT(opIndex, phraseIndex);
            break;
          case id.AND:
            this.#opAND(opIndex, phraseIndex);
            break;
          case id.NOT:
            this.#opNOT(opIndex, phraseIndex);
            break;
          default:
            throw new Error(`${FUNCTION_NAME}unrecognized operator`);
        }
        if (!this.#lookAhead) {
          if (phraseIndex + this.#sysData.phraseLength > this.#maxMatched) {
            this.#maxMatched = phraseIndex + this.#sysData.phraseLength;
          }
        }
        if (this.#stats) {
          this.#stats.collect(op, this.#sysData);
        }
        if (this.#trace) {
          this.#trace.up(op, this.#sysData.state, phraseIndex, this.#sysData.phraseLength);
        }
        this.#treeDepth--;
      }
  • §

    The alteration ALT operator.
    Executes its child nodes, from left to right, until it finds a match. Fails if all of its child nodes fail.

      #opALT(opIndex, phraseIndex) {
        const op = this.#opcodes[opIndex];
        for (let i = 0; i < op.children.length; i += 1) {
          this.#opExecute(op.children[i], phraseIndex);
          if (this.#sysData.state !== id.NOMATCH) {
            break;
          }
        }
      }
  • §

    The concatenation CAT operator.
    Executes all of its child nodes, from left to right, concatenating the matched phrases. Fails if any child nodes fail.

      #opCAT(opIndex, phraseIndex) {
        const op = this.#opcodes[opIndex];
        let success = true;
        let catCharIndex = phraseIndex;
        let catPhrase = 0;
        let astLength;
        if (this.#ast) {
          astLength = this.#ast.getLength();
        }
        for (let i = 0; i < op.children.length; i += 1) {
          this.#opExecute(op.children[i], catCharIndex);
          if (this.#sysData.state === id.NOMATCH) {
            success = false;
            break;
          } else {
            catCharIndex += this.#sysData.phraseLength;
            catPhrase += this.#sysData.phraseLength;
          }
        }
        if (success) {
          this.#sysData.state = catPhrase === 0 ? id.EMPTY : id.MATCH;
          this.#sysData.phraseLength = catPhrase;
        } else {
          this.#sysData.state = id.NOMATCH;
          this.#sysData.phraseLength = 0;
          if (this.#ast) {
            this.#ast.setLength(astLength);
          }
        }
      }
  • §

    The repetion REP operator.
    Repeatedly executes its single child node, concatenating each of the matched phrases found. The number of repetitions executed and its final sysData depends on its min & max repetition values. Zero repetitions (0*0RuleName or 0RuleName) will represent an empty string but is deprecated.

      #opREP(opIndex, phraseIndex) {
        const op = this.#opcodes[opIndex];
        if (op.max === 0) {
          /* this is an empty-string acceptor
           * deprecated: use the TLS empty string operator, "", instead
           */
          this.#sysData.state = id.EMPTY;
          this.#sysData.phraseLength = 0;
          return;
        }
        let repCharIndex = phraseIndex;
        let repPhrase = 0;
        let repCount = 0;
        let astLength;
        if (this.#ast) {
          astLength = this.#ast.getLength();
        }
        while (1) {
          if (repCharIndex >= this.#charEnd) {
            /* exit on end of input string */
            break;
          }
          this.#opExecute(opIndex + 1, repCharIndex);
          if (this.#sysData.state === id.NOMATCH) {
            /* always end if the child node fails */
            break;
          }
          if (this.#sysData.state === id.EMPTY) {
            /* REP always succeeds when the child node returns an empty phrase */
            /* this may not seem obvious, but that's the way it works out */
            break;
          }
          repCount += 1;
          repPhrase += this.#sysData.phraseLength;
          repCharIndex += this.#sysData.phraseLength;
          if (repCount === op.max) {
            /* end on maxed out reps */
            break;
          }
        }
        /* evaluate the match count according to the min, max values */
        if (this.#sysData.state === id.EMPTY) {
          this.#sysData.state = repPhrase === 0 ? id.EMPTY : id.MATCH;
          this.#sysData.phraseLength = repPhrase;
        } else if (repCount >= op.min) {
          this.#sysData.state = repPhrase === 0 ? id.EMPTY : id.MATCH;
          this.#sysData.phraseLength = repPhrase;
        } else {
          this.#sysData.state = id.NOMATCH;
          this.#sysData.phraseLength = 0;
          if (this.#ast) {
            this.#ast.setLength(astLength);
          }
        }
      }
  • §

    Validate the rule callback function’s returned sysData values. It’s the application’s responsibility to get them right but RNM fails if not.

      #validateRnmCallbackResult = (rule, sysData, charsLeft, down) => {
        let FUNCTION_NAME = `${this.#FILENAME}opRNM `;
        if (sysData.phraseLength > charsLeft) {
          let str = `${FUNCTION_NAME}${rule.name}: callback function error: `;
          str += `sysData.phraseLength: ${sysData.phraseLength}`;
          str += ` must be <= remaining chars: ${charsLeft}`;
          throw new Error(str);
        }
        switch (sysData.state) {
          case id.ACTIVE:
            if (!down) {
              throw new Error(
                `${FUNCTION_NAME}opRNM(${rule.name}): callback function return error. ACTIVE state not allowed.`
              );
            }
            break;
          case id.EMPTY:
            sysData.phraseLength = 0;
            break;
          case id.MATCH:
            if (sysData.phraseLength === 0) {
              sysData.state = id.EMPTY;
            }
            break;
          case id.NOMATCH:
            sysData.phraseLength = 0;
            break;
          default:
            throw new Error(
              `${FUNCTION_NAME}(${rule.name}): callback function return error. Unrecognized return state: ${sysData.state}`
            );
        }
      };
  • §

    The rule name RNM operator.
    This operator will acts as a root node for a parse tree branch below and returns the matched phrase to its parent. However, its larger responsibility is handling user-defined callback functions and AST nodes. Note that the AST is a separate object, but RNM calls its functions to create its nodes.

      #opRNM(opIndex, phraseIndex) {
        let astLength;
        let astDefined;
        let savedOpcodes;
        const op = this.#opcodes[opIndex];
        const rule = this.#rules[op.index];
        const callback = this.#ruleCallbacks[rule.index];
        /* ignore AST in look ahead (AND or NOT operator above) */
        if (!this.#lookAhead) {
          astDefined = this.#ast && this.#ast.ruleDefined(op.index);
          if (astDefined) {
            astLength = this.#ast.getLength();
            this.#ast.down(op.index, rule.name);
          }
        }
        if (callback) {
          /* call application's callback going down the parse tree*/
          const charsLeft = this.#charEnd - phraseIndex;
          callback(this.#sysData, this.#chars, phraseIndex, this.#userData);
          this.#validateRnmCallbackResult(rule, this.#sysData, charsLeft, true);
          if (this.#sysData.state === id.ACTIVE) {
            savedOpcodes = this.#opcodes;
            this.#opcodes = rule.opcodes;
            this.#opExecute(0, phraseIndex);
            this.#opcodes = savedOpcodes;
            /* call application's callback going up the parse tree*/
            callback(this.#sysData, this.#chars, phraseIndex, this.#userData);
            this.#validateRnmCallbackResult(rule, this.#sysData, charsLeft, false);
          } /* implied else clause: just accept the callback sysData - RNM acting as UDT */
        } else {
          /* no callback - just execute the rule */
          savedOpcodes = this.#opcodes;
          this.#opcodes = rule.opcodes;
          this.#opExecute(0, phraseIndex);
          this.#opcodes = savedOpcodes;
        }
        if (!this.#lookAhead) {
          /* end AST */
          if (astDefined) {
            if (this.#sysData.state === id.NOMATCH) {
              this.#ast.setLength(astLength);
            } else {
              this.#ast.up(op.index, rule.name, phraseIndex, this.#sysData.phraseLength);
            }
          }
        }
      }
  • §

    Validate the UDT callback function’s returned sysData values. It’s the application’s responsibility to get it right but UDT fails if not.

      #validateUdtCallbackResult = (udt, sysData, charsLeft) => {
        const FUNCTION_NAME = `${this.#FILENAME}opUDT`;
        if (sysData.phraseLength > charsLeft) {
          let str = `${FUNCTION_NAME}(${udt.name}): callback function error: `;
          str += `sysData.phraseLength: ${sysData.phraseLength}`;
          str += ` must be <= remaining chars: ${charsLeft}`;
          throw new Error(str);
        }
        switch (sysData.state) {
          case id.ACTIVE:
            throw new Error(`${FUNCTION_NAME}(${udt.name}) ACTIVE state return not allowed.`);
          case id.EMPTY:
            if (udt.empty) {
              sysData.phraseLength = 0;
            } else {
              throw new Error(`${FUNCTION_NAME}(${udt.name}) may not return EMPTY.`);
            }
            break;
          case id.MATCH:
            if (sysData.phraseLength === 0) {
              if (udt.empty) {
                sysData.state = id.EMPTY;
              } else {
                throw new Error(`${FUNCTION_NAME}(${udt.name}) may not return EMPTY.`);
              }
            }
            break;
          case id.NOMATCH:
            sysData.phraseLength = 0;
            break;
          default:
            throw new Error(
              `${FUNCTION_NAME}(${udt.name}): callback function return error. Unrecognized return state: ${sysData.state}`
            );
        }
      };
  • §

    The User-Define Terminal UDT operator.
    Simply calls the application’s callback function, but operates like RNM with regard to the AST. There is some ambiguity here. UDTs act as terminals for phrase recognition but as named rules for AST nodes.

      #opUDT(opIndex, phraseIndex) {
        let astLength;
        let astIndex;
        let astDefined;
        const op = this.#opcodes[opIndex];
        const udt = this.#udts[op.index];
        this.#sysData.UdtIndex = udt.index;
        /* ignore AST in look ahead */
        if (!this.#lookAhead) {
          astDefined = this.#ast && this.#ast.udtDefined(op.index);
          if (astDefined) {
            astIndex = this.#rules.length + op.index;
            astLength = this.#ast.getLength();
            this.#ast.down(astIndex, udt.name);
          }
        }
        /* call the UDT */
        const charsLeft = this.#charEnd - phraseIndex;
        this.#udtCallbacks[op.index](this.#sysData, this.#chars, phraseIndex, this.#userData);
        this.#validateUdtCallbackResult(udt, this.#sysData, charsLeft);
        if (!this.#lookAhead) {
          /* end AST */
          if (astDefined) {
            if (this.#sysData.state === id.NOMATCH) {
              this.#ast.setLength(astLength);
            } else {
              this.#ast.up(astIndex, udt.name, phraseIndex, this.#sysData.phraseLength);
            }
          }
        }
      }
  • §

    The Terminal Literal String TLS operator.
    Matches its pre-defined phrase against the input string. A case-insensitive match is attempted for ASCII alphbetical characters. The TLS explicitly allows empty phrases. This is the preferred method of explicitly representing an empty string.

      #opTLS(opIndex, phraseIndex) {
        let code;
        const op = this.#opcodes[opIndex];
        this.#sysData.state = id.NOMATCH;
        const len = op.string.length;
        if (len === 0) {
          /* EMPTY match allowed for TLS */
          this.#sysData.state = id.EMPTY;
          return;
        }
        if (phraseIndex + len <= this.#charEnd) {
          for (let i = 0; i < len; i += 1) {
            code = this.#chars[phraseIndex + i];
            if (code >= 65 && code <= 90) {
              code += 32;
            }
            if (code !== op.string[i]) {
              return;
            }
          }
          this.#sysData.state = id.MATCH;
          this.#sysData.phraseLength = len;
        } /* implied else NOMATCH */
      }
  • §

    The Terminal Binary StringTBS operator.
    Matches its pre-defined phrase against the input string. All characters must match exactly. Case-sensitive literal strings ('string' & %s"string") are translated to TBS operators by apg. ‘’ or %s”” will represent an empty string but is deprecated.

      #opTBS(opIndex, phraseIndex) {
        const op = this.#opcodes[opIndex];
        const len = op.string.length;
        this.#sysData.state = id.NOMATCH;
        if (phraseIndex + len <= this.#charEnd) {
          for (let i = 0; i < len; i += 1) {
            if (this.#chars[phraseIndex + i] !== op.string[i]) {
              return;
            }
          }
          this.#sysData.state = id.MATCH;
          this.#sysData.phraseLength = len;
        } /* implied else NOMATCH */
      }
  • §

    The Terminal RangeTRG operator.
    Succeeds if the single first character of the phrase is within the min - max range.

      #opTRG(opIndex, phraseIndex) {
        const op = this.#opcodes[opIndex];
        this.#sysData.state = id.NOMATCH;
        if (phraseIndex < this.#charEnd) {
          if (op.min <= this.#chars[phraseIndex] && this.#chars[phraseIndex] <= op.max) {
            this.#sysData.state = id.MATCH;
            this.#sysData.phraseLength = 1;
          }
        }
      }
  • §

    The positive lookahead, AND, operator.
    Executes its single child node, returning the EMPTY state if it succeeds and NOMATCH if it fails. Always backtracks on any matched phrase and returns EMPTY on success.

      #opAND(opIndex, phraseIndex) {
        const FUNCTION_NAME = `${this.#FILENAME}opAND: `;
        this.#lookAhead++;
        this.#opExecute(opIndex + 1, phraseIndex);
        this.#lookAhead--;
        this.#sysData.phraseLength = 0;
        switch (this.#sysData.state) {
          case id.EMPTY:
            this.#sysData.state = id.EMPTY;
            break;
          case id.MATCH:
            this.#sysData.state = id.EMPTY;
            break;
          case id.NOMATCH:
            this.#sysData.state = id.NOMATCH;
            break;
          default:
            throw new Error(`${FUNCTION_NAME}invalid state ${sysData.state}`);
        }
      }
  • §

    The negative look ahead, NOT, operator.
    Executes its single child node, returning the EMPTY state if it fails and NOMATCH if it succeeds. Always backtracks on any matched phrase and returns EMPTY on success (failure of its child node).

      #opNOT(opIndex, phraseIndex) {
        this.#lookAhead++;
        this.#opExecute(opIndex + 1, phraseIndex);
        this.#lookAhead--;
        this.#sysData.phraseLength = 0;
        switch (this.#sysData.state) {
          case id.EMPTY:
          case id.MATCH:
            this.#sysData.state = id.NOMATCH;
            break;
          case id.NOMATCH:
            this.#sysData.state = id.EMPTY;
            break;
          default:
            throw new Error(`${this.#FILENAME}opNOT: invalid state ${sysData.state}`);
        }
      }
    }